Exploratory data analysis

# Number of majors in each category, largest category first.
majors_processed %>%
  count(Major_category, sort = TRUE)
## # A tibble: 16 x 2
##    Major_category                          n
##    <chr>                               <int>
##  1 Engineering                            29
##  2 Education                              16
##  3 Humanities & Liberal Arts              15
##  4 Biology & Life Science                 14
##  5 Business                               13
##  6 Health                                 12
##  7 Computers & Mathematics                11
##  8 Agriculture & Natural Resources        10
##  9 Physical Sciences                      10
## 10 Psychology & Social Work                9
## 11 Social Science                          9
## 12 Arts                                    8
## 13 Industrial Arts & Consumer Services     7
## 14 Law & Public Policy                     5
## 15 Communications & Journalism             4
## 16 Interdisciplinary                       1
# Per-category roll-up: head counts, a sample-size-weighted average of the
# per-major median salary, and the share of women. Majors without a Total
# are excluded first. Sorted from the highest to the lowest share of women.
by_major_category <- majors_processed %>%
  filter(!is.na(Total)) %>%
  group_by(Major_category) %>%
  summarise(
    Men = sum(Men),
    Women = sum(Women),
    Total = sum(Total),
    MedianSalary = sum(Median * Sample_size) / sum(Sample_size),
    # Women and Total here are the category sums computed just above.
    ShareWomen = Women / Total
  ) %>%
  arrange(desc(ShareWomen))
# Histogram of median earnings across majors (default binning).
ggplot(majors_processed, aes(x = Median)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Metallurgical gender mix

library(ggrepel)
# Share of women per major, one box per major category, with categories
# ordered by fct_reorder() on ShareWomen.
majors_processed %>%
  mutate(Major_category = fct_reorder(Major_category, ShareWomen)) %>%
  ggplot(aes(fct_lump(Major_category, 20), ShareWomen,
             fill = Major_category)) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  theme_bw() +
  # NOTE(review): 0.153 is a hard-coded reference share; presumably the
  # share of women in the major named in the section heading - confirm.
  geom_hline(yintercept = 0.153, lty = 2)

Comparing income distributions across major category

# Boxplot comparing earnings across major categories; categories are
# ordered by fct_reorder() on Median and earnings use a log10 dollar axis.
majors_processed %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(aes(Major_category, Median, fill = Major_category)) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() +
  scale_y_log10(labels = scales::dollar_format())

Median earnings across major categories

# One bar per major category: the median of the per-major median earnings,
# with bars ordered by that value.
majors_processed %>%
  group_by(Major_category) %>%
  summarise(Median = median(Median)) %>%
  mutate(Major_category = fct_reorder(Major_category, Median)) %>%
  ggplot(
    mapping = aes(x = Major_category, y = Median, fill = Major_category)
  ) +
  geom_col(show.legend = FALSE) +
  coord_flip()

What are the highest earning majors?

# The 20 majors with the highest median earnings. Points mark the median,
# error bars span the 25th to 75th percentile, and the axis starts at 0.
majors_processed %>%
  arrange(desc(Median)) %>%
  head(20) %>%
  select(Major, Major_category, Median, P25th, P75th) %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  ggplot(aes(x = Major, y = Median, colour = Major_category)) +
  geom_point() +
  geom_errorbar(aes(ymin = P25th, ymax = P75th)) +
  expand_limits(y = 0) +
  coord_flip()

Business subjects - Highest earning majors

# Median earnings of each Business major, ordered by earnings.
# (The identical pipeline used to appear twice in a row, rendering the same
# plot twice; it is kept once.)
majors_processed %>%
  filter(Major_category == "Business") %>%
  mutate(Major = fct_reorder(Major, Median)) %>%
  ggplot(aes(Major, Median, fill = Major)) +
  geom_col(show.legend = FALSE) +
  coord_flip()

### Majors with the lowest share of women

library(ggrepel)
# The 20 majors with the smallest share of women, ordered by that share.
majors_processed %>%
  arrange(ShareWomen) %>%
  head(n = 20) %>%
  mutate(Major = fct_reorder(Major, ShareWomen)) %>%
  ggplot(mapping = aes(x = Major, y = ShareWomen)) +
  geom_point() +
  coord_flip()

### Mapping share of women and median earnings per major

# Median earnings (log10 dollar axis) against the share of women, for majors
# with more than 30 sampled graduates; each point is labelled with its
# sample size and a linear fit is overlaid.
majors_processed %>%
  filter(Sample_size > 30) %>%
  # Plot ShareWomen itself: desc() is an arrange() helper, and inside aes()
  # it negated the values, so the axis showed negative shares.
  ggplot(aes(ShareWomen, Median)) +
  geom_point() +
  scale_y_log10(labels = scales::dollar_format()) +
  geom_smooth(method = "lm") +
  # alpha is a fixed setting, so it goes outside aes(); inside aes() it was
  # treated as a mapped variable and produced a spurious legend.
  geom_text_repel(aes(label = Sample_size), alpha = 0.2) +
  theme_bw()

# Unweighted linear regression of median earnings on the share of women.
summary(
  lm(formula = Median ~ ShareWomen, data = recent_grads)
)
## 
## Call:
## lm(formula = Median ~ ShareWomen, data = recent_grads)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -17261  -5474  -1007   3502  57604 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    56093       1705   32.90   <2e-16 ***
## ShareWomen    -30670       2987  -10.27   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 9031 on 170 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.3828, Adjusted R-squared:  0.3791 
## F-statistic: 105.4 on 1 and 170 DF,  p-value: < 2.2e-16

What were the most common Majors?

# The 20 majors with the most graduates (Total), ordered by size.
majors_processed %>%
  mutate(Major = fct_reorder(Major, Total)) %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  ggplot(aes(Major, Total, fill = Major)) +
  # The fill legend would only repeat the 20 axis labels, so it is hidden,
  # as in the other per-major bar charts.
  geom_col(show.legend = FALSE) +
  # `labels` spelled out in full rather than relying on partial matching.
  scale_y_continuous(labels = scales::comma_format()) +
  coord_flip()

How does gender mix relate to typical earnings?

# Men/women breakdown of the 20 largest majors, as stacked columns.
majors_processed %>%
  arrange(desc(Total)) %>%
  head(20) %>%
  mutate(Major = fct_reorder(Major, Total)) %>%
  # Reshape the Men and Women columns into Gender/Number pairs
  # (pivot_longer() replaces the superseded gather()).
  pivot_longer(c(Men, Women), names_to = "Gender", values_to = "Number") %>%
  ggplot(aes(Major, Number, fill = Gender)) +
  geom_col() +
  # `labels` spelled out in full rather than relying on partial matching.
  scale_y_continuous(labels = scales::comma_format()) +
  coord_flip()

# Category-level view: weighted median salary against the share of women,
# with a linear fit, repelled category labels and a y-axis anchored at 0.
ggplot(by_major_category, aes(x = ShareWomen, y = MedianSalary)) +
  geom_point() +
  geom_smooth(method = "lm") +
  geom_text_repel(aes(label = Major_category), force = 0.2) +
  expand_limits(y = 0)

library(plotly)
# Interactive scatter of median earnings against the share of women.
# Colour shows the major category (lumped with fct_lump(), n = 7), point
# size shows the sample size, and one linear fit is drawn across all points
# (group = 1).
g <- majors_processed %>%
  mutate(Major_category = fct_lump(Major_category, 7)) %>%
  ggplot(
    aes(
      x = ShareWomen,
      y = Median,
      colour = Major_category,
      size = Sample_size
    )
  ) +
  geom_point(aes(label = Major)) +
  geom_smooth(aes(group = 1), method = "lm") +
  scale_x_continuous(labels = scales::percent_format()) +
  scale_y_continuous(labels = scales::dollar_format()) +
  expand_limits(y = 0)

# Hand the static ggplot to plotly.
ggplotly(g)
# Linear regression of median earnings on the share of women, with each
# major weighted by its sample size.
majors_processed %>%
  select(Major, Total, ShareWomen, Sample_size, Median) %>%
  lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size) %>%
  summary()
## 
## Call:
## lm(formula = Median ~ ShareWomen, data = ., weights = Sample_size)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -260500  -61042  -13899   33262  865081 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    52073       1436  36.255   <2e-16 ***
## ShareWomen    -23650       2403  -9.842   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 123000 on 170 degrees of freedom
##   (1 observation deleted due to missingness)
## Multiple R-squared:  0.363,  Adjusted R-squared:  0.3592 
## F-statistic: 96.87 on 1 and 170 DF,  p-value: < 2.2e-16
library(broom)
# Fit the sample-size-weighted regression Median ~ ShareWomen separately
# within every major category that has at least 10 majors, keep the
# ShareWomen slope of each fit, and add FDR-adjusted p-values.
majors_processed %>%
  select(Major, Major_category, Total, ShareWomen, Sample_size, Median) %>%
  add_count(Major_category) %>%
  filter(n >= 10) %>%
  # Name the list column explicitly; the unnamed nest(-col) form is the
  # pre-1.0 tidyr interface and triggers a warning on current versions.
  nest(data = -Major_category) %>%
  mutate(
    model = map(
      data,
      ~ lm(Median ~ ShareWomen, data = ., weights = Sample_size)
    ),
    tidied = map(model, tidy)
  ) %>%
  unnest(tidied) %>%
  filter(term == "ShareWomen") %>%
  arrange(estimate) %>%
  # Adjust across the retained per-category slopes only.
  mutate(fdr = p.adjust(p.value, method = "fdr"))
## # A tibble: 9 x 9
##   Major_category  data   model term  estimate std.error statistic p.value    fdr
##   <chr>           <list> <lis> <chr>    <dbl>     <dbl>     <dbl>   <dbl>  <dbl>
## 1 Biology & Life~ <tibb~ <lm>  Shar~  -43735.    20982.    -2.08   0.0592 0.106 
## 2 Engineering     <tibb~ <lm>  Shar~  -33912.    15418.    -2.20   0.0366 0.0937
## 3 Computers & Ma~ <tibb~ <lm>  Shar~  -28694.    18552.    -1.55   0.156  0.235 
## 4 Business        <tibb~ <lm>  Shar~  -28171.     9810.    -2.87   0.0152 0.0937
## 5 Agriculture & ~ <tibb~ <lm>  Shar~  -16263.     5975.    -2.72   0.0297 0.0937
## 6 Physical Scien~ <tibb~ <lm>  Shar~  -12820.    13349.    -0.960  0.365  0.469 
## 7 Education       <tibb~ <lm>  Shar~   -1996.     3084.    -0.647  0.528  0.594 
## 8 Humanities & L~ <tibb~ <lm>  Shar~   -1814.     4128.    -0.439  0.668  0.668 
## 9 Health          <tibb~ <lm>  Shar~   54721.    23427.     2.34   0.0416 0.0937